We want to determine the extent to which the MIKK panel shows evidence of introgression with other medaka populations, specifically the allopatric Northern and Korean medaka strains, relative to sympatric Southern medaka strains.

Setup

library(here)
source(here::here("code", "scripts", "introgression", "source.R"))

Working directory

Working directory on EBI cluster: /hps/research1/birney/users/ian/mikk_paper

Create Singularity containers

container_dir=../sing_conts
# load Singularity (version 3.5.0)
module load singularity 

for package in $( echo r-base_4.0.4 bcftools_1.9 numpy_1.15.4 bash_3.0.22 bioconductor_3.12 ) ; do
  if [[ ! -f $container_dir/$package.sif ]]; then
    singularity build \
      --remote \
      $container_dir/$package.sif \
      envs/$package.def 
  fi ;
done

renv

Install all required packages (using r-base in Singularity container).

container_dir=../sing_conts
baseR=r-base_4.0.4

bsub -Is "singularity shell $container_dir/$baseR"
# Install all required packages
renv::restore()

Copy scripts from Simon Martin’s GitHub repo

wget -P code/scripts/introgression https://raw.githubusercontent.com/simonhmartin/genomics_general/master/ABBABABAwindows.py

wget -P code/scripts/introgression https://raw.githubusercontent.com/simonhmartin/genomics_general/master/genomics.py

Process multiple alignment data

Download from Ensembl

  • Ensembl 103: Only includes Oryzias latipes HdrR and Oryzias melastigma (Indian medaka).
  • Ensembl 102: Includes all target species. Use.

ftp://ftp.ensembl.org/pub/release-102/emf/ensembl-compara/multiple_alignments/50_fish.epo/README.50_fish.epo reads:

Alignments are grouped by japanese medaka hdrr chromosome, and then by coordinate system. Alignments containing duplications in japanese medaka hdrr are dumped once per duplicated segment. The files named .other.emf contain alignments that do not include any japanese medaka hdrr region. Each file contains up to 200 alignments.

ftp_dir=ftp://ftp.ensembl.org/pub/release-102/emf/ensembl-compara/multiple_alignments/50_fish.epo/
target_dir=../introgression/release-102

mkdir -p $target_dir/raw

# download, exlcuding *.other* files
wget -P $target_dir/raw $ftp_dir/* -R "*other*"

# unzip into new directory (excluding "other")
$target_dir/unzipped
mkdir -p $target_dir/unzipped

for i in $(find $target_dir/raw/50_fish.epo.[0-9]*); do
  name=$(basename $i | cut -f3,4 -d'.');
  bsub "zcat $i > $target_dir/unzipped/$name";
done
  • NOTE: File 6_2.emf is in a completely different format, with CIGAR strings instead of the normal SEQ, TREE, ID and DATA segments. It appears the file is corrupted.
  • The 6_2 file in release 101 is unaffected. Remove all release 102 chr 6 files and replace with release 101 files.
# remove release 102 files for chr 6
rm $target_dir/unzipped/6_*

# download chr 6 files from release 101
ftp_dir_101=ftp://ftp.ensembl.org/pub/release-101/emf/ensembl-compara/multiple_alignments/50_fish.epo/
target_dir_101=../introgression/release-101

mkdir -p $target_dir_101/raw

wget -P $target_dir_101/raw $ftp_dir_101/50_fish.epo.6_*

# unzip
mkdir -p $target_dir_101/unzipped

for i in $(find $target_dir_101/raw/*); do
  name=$(basename $i | cut -f3,4 -d'.') ;
  zcat $i > $target_dir_101/unzipped/$name ;
done

# copy over to release 102 directory
cp $target_dir_101/unzipped/* $target_dir/unzipped

Generate tree plot

Copy tree to file

tree_file=data/introgression/release_102_tree.txt

awk "NR==58,NR==205" $target_dir/raw/README.50_fish.epo \
  > $tree_file

Then manually edit $tree_file using regex to find spaces and replace them with "_": {bash} (?<=[a-z])( )(?=[a-z])

phylo_tree <- ape::read.tree(file = here::here("data", "introgression", "release_102_tree.txt"))

Full tree

# Colour all Oryzias
ids <- phylo_tree$tip.label[grep("Oryzias", phylo_tree$tip.label)]
# get indices of edges descending from MRCA (determined through trial and error)
oryzias_nodes <- seq(35, 42)
all_med_col <- ifelse(1:length(phylo_tree[["edge.length"]]) %in% oryzias_nodes, "#E84141", "black")
# set colours for tip labels
all_med_tip <- ifelse(phylo_tree$tip.label %in% ids, "#E84141", "black")
# plot
ape::plot.phylo(phylo_tree,
                use.edge.length = T,
                edge.color = all_med_col,
                tip.color = all_med_tip,
                font = 4)

# Save to repo
png(file= file.path(plots_dir, "tree_all.png"),
    width=22,
    height=25,
    units = "cm",
    res = 400)
ape::plot.phylo(phylo_tree,
                use.edge.length = T,
                edge.color = all_med_col,
                tip.color = all_med_tip,
                font = 4)
dev.off()

Oryzias only

New tree file created manually to extract Oryzias fishes only, and replace reference codes (e.g. “ASM223467v1”) with line names (e.g. “HdrR”).

in_file = here::here("data/introgression/release_102_tree_oryzias_only.txt")
# Read in
phylo_tree <- ape::read.tree(file = in_file)
# Set colours 
phylo_cols <- c("#55b6b0", "#f33a56", "#f3b61f", "#f6673a", "#631e68")
# Plot
ape::plot.phylo(phylo_tree,
                font = 4,
                tip.color = phylo_cols)

out_file = here::here(plots_dir, "tree_oryzias.png")
# Save
png(file=out_file,
    width=2700,
    height=1720,
    units = "px",
    res = 400)
ape::plot.phylo(phylo_tree,
                font = 4,
                tip.color = phylo_cols)
dev.off()

Add cross for ancestor

out_file = here::here(plots_dir, "tree_oryzias_with_ancestor.png")
in_file = here::here(plots_dir, "tree_oryzias.png")

tree_path = in_file

ggdraw() +
  draw_image(tree_path) +
  draw_label("x",
             x = 0.152,
             y = 0.52,
             fontface = "bold",
             color = "#f77cb5",
             size = 25)

ggsave(out_file,
    width=15.69,
    height=10,
    units = "cm",
    dpi = 400)
knitr::include_graphics(basename(out_file))

Divide by segment

target_dir=../introgression/release-102
segments_dir=$target_dir/segmented
date=20210312
script=code/scripts/introgression/20200907_extract-emf-segments.sh

mkdir -p $segments_dir

for i in $(find $target_dir/unzipped/* ); do
  # get basename
  bname=$(basename $i);
  bname_short=$(echo ${bname::-4} );
  # get chromosome
  chr=$(echo $bname | cut -f1 -d"_" );
  # make directory for each EMF file
  new_path=$(echo $segments_dir/$bname_short );
  if [ ! -d "$new_path" ]; then
    mkdir $new_path;
  fi
  # get segment count
  segment_count=$(grep "^DATA" $i | wc -l );
  # get segment start and end for each file
  for j in $(seq 1 $segment_count ); do
    bsub \
      -o ../log/$date\segment_$bname_short\_$j.out \
      -e ../log/$date\segment_$bname_short\_$j.err \
      "$script $i $j $new_path "
  done;  
done

# How many files?
find $segments_dir/*/*.data.txt | wc -l
# 8951
find $in_dir/*/*_1.data.txt | wc -l
# 4341
find $in_dir/*/*_-1.data.txt | wc -l
# 4610

Run analysis pipeline with snakemake

snmk_proj="introgression"

module load singularity
conda activate snakemake

snakemake \
 --jobs 5000 \
 --latency-wait 100 \
 --cluster-config code/snakemake/$snmk_proj/config/cluster.json \
 --cluster 'bsub -g /snakemake_bgenie -J {cluster.name} -n {cluster.n} -M {cluster.memory} -o {cluster.output} -e {cluster.error}' \
 --keep-going \
 --rerun-incomplete \
 --use-conda \
 --use-singularity \
 -s code/snakemake/$snmk_proj/Snakefile \
 -p

f statistic analysis

Read in data

data_file = here::here("data", "introgression", "20210315_f_stat_final.txt")
# Read in data
final_df <- read.table(data_file,
                       header = T,
                       sep = "\t",
                       as.is = T)

final_df <- final_df %>% 
  dplyr::mutate(across(P2,
                       ~factor(.x, levels = fish_order))) %>% 
  dplyr::mutate(chr = factor(chr, levels = chr_order))

knitr::kable(head(final_df))
P1 P2 P3 chr d_stat z_score admix_f f_ci_lower f_ci_upper
javanicus HdrR MIKK all 0.9072306 324.06735 0.7093524 0.6943606 0.7243441
javanicus HdrR MIKK 1 0.9422313 162.46960 0.8090713 0.7776743 0.8404682
javanicus HdrR MIKK 2 0.9128441 57.55129 0.7668972 0.6931469 0.8406474
javanicus HdrR MIKK 3 0.8869725 76.53532 0.6217246 0.5501172 0.6933320
javanicus HdrR MIKK 4 0.9318431 72.86056 0.7503846 0.6643958 0.8363734
javanicus HdrR MIKK 5 0.8879805 69.67407 0.6565419 0.5865893 0.7264945

Create DF with mean melastigma and javanicus

cor_df <- final_df %>% 
  # filter for when P1 is another Oryzias, and P2 
  dplyr::filter(P1 %in% c("javanicus", "melastigma") & P2 != "MIKK" & P3 == "MIKK") %>% 
  # pivot to put the admixture_f stat for melastigma and javanicus in the same row
  tidyr::pivot_wider(id_cols = c("P2", "chr"),
                     names_from = P1,
                     values_from = c(admix_f, f_ci_lower, f_ci_upper))

cor_df$chr <- as.character(cor_df$chr)
cor_df$chr <- ifelse(cor_df$chr == "all", "genome-wide", cor_df$chr)
chr_order_plot <- c(seq(1,24), "genome-wide")
cor_df$chr <- factor(cor_df$chr, levels = chr_order_plot)

cor_df_means <- cor_df %>%
  # apply across rows
  dplyr::rowwise() %>% 
  # get means for f and CIs
  dplyr::mutate(mean_f = mean(c(admix_f_javanicus, admix_f_melastigma)),
                mean_ci_upper = mean(c(f_ci_upper_javanicus, f_ci_upper_melastigma)),
                mean_ci_lower = mean(c(f_ci_lower_javanicus, f_ci_lower_melastigma))) %>% 
  # set stats at a maximum of 1
  dplyr::mutate(across(c("mean_f", "mean_ci_upper", "mean_ci_lower"),
                       ~dplyr::if_else(.x > 1,
                                       1,
                                       .x)))

knitr::kable(head(cor_df_means))
P2 chr admix_f_javanicus admix_f_melastigma f_ci_lower_javanicus f_ci_lower_melastigma f_ci_upper_javanicus f_ci_upper_melastigma mean_f mean_ci_upper mean_ci_lower
HdrR genome-wide 0.7093524 0.6860648 0.6943606 0.6712495 0.7243441 0.7008800 0.6977086 0.7126121 0.6828051
HdrR 1 0.8090713 0.7831495 0.7776743 0.7466954 0.8404682 0.8196036 0.7961104 0.8300359 0.7621849
HdrR 2 0.7668972 0.7621796 0.6931469 0.6999464 0.8406474 0.8244128 0.7645384 0.8325301 0.6965466
HdrR 3 0.6217246 0.5968322 0.5501172 0.5217981 0.6933320 0.6718664 0.6092784 0.6825992 0.5359576
HdrR 4 0.7503846 0.7399951 0.6643958 0.6811117 0.8363734 0.7988785 0.7451899 0.8176260 0.6727538
HdrR 5 0.6565419 0.6241430 0.5865893 0.5565435 0.7264945 0.6917426 0.6403425 0.7091186 0.5715664

Plot

fstat_plot = cor_df_means %>% 
  ggplot(aes(P2, mean_f, fill = P2)) +
    geom_col() +
    geom_errorbar(aes(ymin = mean_ci_lower,
                      ymax = mean_ci_upper),
                  position = position_dodge(0.9),
                  width = 0.25) +  
    guides(fill = F) +
    facet_wrap(~chr) +
    ylim(0,1) +
    ylab(expression(paste("Mean ", italic("f"), " statistic"))) +
    theme_cowplot(font_size = 8) +
    scale_fill_manual(values = pal_abba)

fstat_plot

out_file = here::here(plots_dir, "20210315_f_stat")

# PNG
ggsave(filename = paste(out_file, ".png", sep = ""),
       device = "png",
       width = 24.75,
       height = 19.5,
       units = "cm",
       dpi = 500)

# SVG
ggsave(filename = paste(out_file, ".svg", sep = ""),
       device = "svg",
       width = 24.75,
       height = 19.5,
       units = "cm")

Sliding windows ABBA BABA

Read in data

in_file = here::here("data", "introgression", "abba_sliding_final", "50000_100.txt")
# Read in data
df = readr::read_csv(in_file) %>% 
  dplyr::arrange(p1, p2, scaffold, start)

# Convert fd to 0 if D < 0
df$fd = ifelse(df$D < 0,
               0,
               df$fd)

# Change names
df = df %>% 
  dplyr::mutate(p2 = recode(df$p2, hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))

Plot

Standard

df %>% 
  dplyr::filter(p1 == "melastigma") %>% 
  ggplot() +
    geom_line(aes(mid, fd, colour = p2)) +
    facet_wrap(~scaffold, nrow = 24, ncol = 1) +
    scale_colour_manual(values = pal_abba) +
    theme_bw(base_size = 10) +
    scale_x_continuous(breaks = c(0, 5000000, 10000000, 15000000, 20000000, 25000000, 30000000, 35000000),
                       labels = scales::comma) +
    xlab("Base position") +
    ylab(bquote(italic(f[d]))) +
    labs(colour = "P2")

out_file = here::here(plots_dir, "20210317_abba_sliding.png")

ggsave(filename = out_file,
       device = "png",
       width = 24.75,
       height = 50,
       units = "cm",
       dpi = 300)

Karyoplot

Make custom chromosome scaffold

# Get chromosome lengths
med_chr_lens = read.table(here("data", "Oryzias_latipes.ASM223467v1.dna.toplevel.fa_chr_counts.txt"),
                          col.names = c("chr", "end"))
# Add start
med_chr_lens$start = 1
# Reorder
med_chr_lens = med_chr_lens %>% 
  dplyr::select(chr, start, end)
# Create custom genome
med_genome = regioneR::toGRanges(med_chr_lens)

Process ABBA sliding windows data

in_file = here::here("data", "introgression", "abba_sliding_final", "50000_100.txt")
# Read in data
df = readr::read_csv(in_file) %>% 
  dplyr::arrange(p1, p2, scaffold, start)

# Convert fd to 0 if D < 0
df$fd = ifelse(df$D < 0,
               0,
               df$fd)

# Change names
df = df %>% 
  dplyr::mutate(p2 = recode(df$p2, hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))

# Get df with mean of melastigma/javanicus
df_kp = df %>% 
  pivot_wider(id_cols = c(scaffold, start, end, mid, p2), names_from = p1, values_from = fd) %>%
  # get mean of melastigma/javanicus
  dplyr::mutate(mean_fd = rowMeans(dplyr::select(., melastigma, javanicus), na.rm = T)) %>% 
  dplyr::arrange(p2, scaffold, start)

knitr::kable(head(df_kp))
scaffold start end mid p2 javanicus melastigma mean_fd
1 650001 700000 666130 HdrR NA 0.1231 0.12310
1 700001 750000 721651 HdrR NA 0.2357 0.23570
1 750001 800000 774678 HdrR NA 0.1636 0.16360
1 950001 1000000 974734 HdrR NA 0.2203 0.22030
1 1000001 1050000 1028780 HdrR NA 0.2935 0.29350
1 1050001 1100000 1075868 HdrR 0.4066 0.2985 0.35255

Read in SNP density data

HNI and HSOK
in_file = here::here("data/introgression/hni_hsok.txt.gz")
# Read in file on local
ol_ranges_df = read.table(in_file,
                          header = T, 
                          sep = "\t", 
                          as.is = T)

ol_ranges_df_long = ol_ranges_df %>% 
  tidyr::pivot_longer(cols = c(hni, hsok), names_to = "line", values_to = "present")

ol_ranges_list = split(ol_ranges_df_long, f = ol_ranges_df_long$line)

ol_ranges_list = lapply(ol_ranges_list, function(x){
  # remove NAs
  df = x %>% 
    tidyr::drop_na(present)
  # convert to GRanges object
  ol_ranges = GenomicRanges::makeGRangesFromDataFrame(df,
                                                      ignore.strand = T,
                                                      seqnames.field = "chr",
                                                      start.field = "pos",
                                                      end.field = "pos")
  return(ol_ranges)
})
MIKK
in_file = here::here("data/introgression/mikk.txt.gz")
# Read in file on local
mikk_ranges_df = read.table(in_file,
                            col.names = c("chr", "pos"), 
                            sep = "\t", 
                            as.is = T)

# Convert to GRanges object

mikk_ranges = GenomicRanges::makeGRangesFromDataFrame(mikk_ranges_df,
                                                      ignore.strand = T,
                                                      seqnames.field = "chr",
                                                      start.field = "pos",
                                                      end.field = "pos")

Get exon density

# Get list of exons from biomaRt

## Select dataset
olat_mart = biomaRt::useEnsembl(biomart = "ensembl", dataset = "olatipes_gene_ensembl")
## Get attributes of interest (exon ID, chr, start, end)
exons <- biomaRt::getBM(attributes = c("chromosome_name", "ensembl_gene_id", "ensembl_transcript_id", "transcript_start", "transcript_end", "transcript_length", "ensembl_exon_id", "rank", "strand", "exon_chrom_start", "exon_chrom_end", "cds_start", "cds_end"),
               mart = olat_mart)
## Convert exons to GRanges
ex_ranges = GenomicRanges::makeGRangesFromDataFrame(exons,
                                                    ignore.strand = T,
                                                    seqnames.field = "chromosome_name",
                                                    start.field = "exon_chrom_start",
                                                    end.field = "exon_chrom_end")

All chromosomes

file_out = file.path(plots_dir, "20210318_fd_with_density_all.png")
# Save
png(file=file_out,
    width=8500,
    height=13500,
    units = "px",
    res = 400)
# Plot 
kp = plotKaryotype(med_genome)
# Add base numbers 
karyoploteR::kpAddBaseNumbers(kp, tick.dist = 5000000, cex = 0.3)
# Add data backgrounds
karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add axis label
kpAxis(kp, r0=0.3, r1 = 1, cex = 0.4)
# Add fd data
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HNI"],
                     x = df_kp$mid[df_kp$p2 == "HNI"],
                     y = df_kp$mean_fd[df_kp$p2 == "HNI"],
                     col = "#F6673A",
                     r0=0.3, r1 = 1)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HdrR"],
                     x = df_kp$mid[df_kp$p2 == "HdrR"],
                     y = df_kp$mean_fd[df_kp$p2 == "HdrR"],
                     col = "#F3B61F",
                     r0=0.3, r1 = 1)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HSOK"],
                     x = df_kp$mid[df_kp$p2 == "HSOK"],
                     y = df_kp$mean_fd[df_kp$p2 == "HSOK"],
                     col = "#631E68",
                     r0=0.3, r1 = 1)
# Add SNP density data
kpPlotDensity(kp, data=mikk_ranges, col = "#49A379",
              r0=0, r1=0.1, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hni, col = "#F6673A",
              r0=0.1, r1=0.2, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hsok, col = "#631E68", 
              r0=0.2, r1=0.3, 
              window.size = 25000)
#kpPlotDensity(kp, data=ol_ranges_list$hdrr, col = "#F3B61F",
#              r0=0.45, r1=0.6,
#              window.size = 25000)
# Add exon density to ideogram
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 1)
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 0)
# Add labels
kpAddLabels(kp, labels="MIKK",
            r0=0, r1=0.05,
            label.margin = 0.001,
            cex = 0.4)
kpAddLabels(kp, labels="HNI",
            r0=0.1, r1=0.15, 
            label.margin = 0.001,
            cex = 0.4)
kpAddLabels(kp, labels="HSOK",
            r0=0.2, r1=0.25,
            label.margin = 0.001,
            cex = 0.4)
#kpAddLabels(kp, labels="HdrR",
#            r0=0.45, r1=0.6, 
#            cex = 0.4)
kpAddLabels(kp, labels=bquote(italic(f[d])),
            r0=0.3, r1=1, 
            label.margin = 0.035,
            cex = 0.6)
dev.off()
knitr::include_graphics(basename(file_out))

Chromosome 4

out_file = file.path(plots_dir, "20210318_fd_with_density_chr_4.png")
png(file=out_file,
    width=5500,
    height=1186,
    units = "px",
    res = 400)

# Plot 
kp = plotKaryotype(med_genome, chromosomes = "4", cex = 1.5)
# Add base numbers 
karyoploteR::kpAddBaseNumbers(kp, tick.dist = 5000000, cex = 0.7)
# Add data backgrounds
karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add axis label
kpAxis(kp, r0=0.3, r1 = 1, cex = 0.8)
# Add fd data
lwd = 2
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HNI"],
                     x = df_kp$mid[df_kp$p2 == "HNI"],
                     y = df_kp$mean_fd[df_kp$p2 == "HNI"],
                     col = "#F6673A",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HdrR"],
                     x = df_kp$mid[df_kp$p2 == "HdrR"],
                     y = df_kp$mean_fd[df_kp$p2 == "HdrR"],
                     col = "#F3B61F",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HSOK"],
                     x = df_kp$mid[df_kp$p2 == "HSOK"],
                     y = df_kp$mean_fd[df_kp$p2 == "HSOK"],
                     col = "#631E68",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
# Add SNP density data
kpPlotDensity(kp, data=mikk_ranges, col = "#49A379",
              r0=0, r1=0.1, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hni, col = "#F6673A",
              r0=0.1, r1=0.2, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hsok, col = "#631E68", 
              r0=0.2, r1=0.3, 
              window.size = 25000)
#kpPlotDensity(kp, data=ol_ranges_list$hdrr, col = "#F3B61F",
#              r0=0.45, r1=0.6,
#              window.size = 25000)
# Add exon density to ideogram
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 1)
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 0)
# Add labels
kpAddLabels(kp, labels="MIKK",
            r0=0, r1=0.05,
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HNI",
            r0=0.1, r1=0.15, 
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HSOK",
            r0=0.2, r1=0.25,
            label.margin = 0.001,
            cex = 0.5)
#kpAddLabels(kp, labels="HdrR",
#            r0=0.45, r1=0.6, 
#            cex = 0.4)
kpAddLabels(kp, labels=bquote(italic(f[d])),
            r0=0.3, r1=1, 
            label.margin = 0.035,
            cex = 1)
dev.off()
knitr::include_graphics(basename(out_file))

Final figure

ABBA BABA diagram

Created with Vectr and saved here: plots/introgression/20210318_abba_diagram.svg

Compile all

abba_diagram = here::here(plots_dir, "abba_diagram.png")
tree = here::here(plots_dir, "tree_oryzias_with_ancestor.png")
karyo_chr4 = here::here(plots_dir, "20210318_fd_with_density_chr_4.png")

final_abba = ggdraw() +
  draw_image(tree,
            x = 0, y = .7, width = .4, height = .35, vjust = .1, hjust = -.1, scale = 1.2) +
  draw_image(karyo_chr4,
             x = 0, y = 0, width = 1, height = 0.3, scale = 1.2) +   
  draw_plot(fstat_plot,
             x = .4, y = .3, width = .6, height = .7) +
  draw_image(abba_diagram,
          x = 0, y = .3, width = .4, height = .35, vjust = -.05, scale = 1.1) +
  draw_plot_label(label = c("A", "B", "C", "D"), size = 16,
                  x = c(0, 0, .38, 0), y = c(1, .7, 1, .3))  


final_abba

NA
NA
ggsave(here::here(plots_dir, "20210318_final_figure.png"),
       width = 35,
       height = 21.875,
       units = "cm",
       dpi = 500)

New final figure with circos

Circos

Read in data

target_file = here::here("data/introgression/abba_sliding_final_with_icab/min-sites-250.txt")

mikk_abba_final = readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  # recode `fd` as 0 if `D` is negative
  dplyr::mutate(fd = ifelse(D < 0, 0, fd))

Sanity check with counts of sites:

readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  dplyr::count(p1, p2)

── Column specification ────────────────────────────────────────────────────────────────────────────────────────
cols(
  scaffold = col_double(),
  start = col_double(),
  end = col_double(),
  mid = col_double(),
  sites = col_double(),
  sitesUsed = col_double(),
  ABBA = col_double(),
  BABA = col_double(),
  D = col_double(),
  fd = col_double(),
  fdM = col_double(),
  p1 = col_character(),
  p2 = col_character()
)

As suggested by Simon Martin here: https://github.com/simonhmartin/genomics_general#abba-baba-statistics-in-sliding-windows > fd gives meaningless values (<0 or >1) if D is negative. If there is no excess of shared derived alleles between P2 and P3 (indicated by a positive D), then the excess cannot be quantified. fd values for windows with negative D should therefore either be discarded or converted to zero, depending on your hypothesis.

How many windows have D > 0?

readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  # recode lines
  dplyr::mutate(across(c("p1", "p2"), ~factor(.x, levels = c("icab", "hdrr", "hni", "hsok"))),
                across(c("p1", "p2"), ~recode(.x, icab = "iCab", hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))) %>% 
  dplyr::group_by(p1, p2) %>% 
  dplyr::summarise(n_pos_d = length(which(D > 0))) %>% 
  ggplot() +
    geom_col(aes(p2, n_pos_d, fill = p2)) +
    facet_wrap(~p1) +
    scale_fill_manual(values = pal_abba) +
    xlab("P2") +
    ylab("Number of windows (sites) with positive D") +
    ggtitle("Choice of P1 (iCab or HdrR)") +
    theme_cowplot() +
    labs(fill = "P2")

ggsave(here::here("plots/introgression/20210429_p1_hdrr_v_icab_counts_valid_sites.png"),
       device = "png",
       width=15.69,
       height=10,
       units = "cm",
       dpi = 400)

Distributions of \(f_D\)

readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  # recode lines
  dplyr::mutate(across(c("p1", "p2"), ~factor(.x, levels = c("icab", "hdrr", "hni", "hsok"))),
                across(c("p1", "p2"), ~recode(.x, icab = "iCab", hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))) %>%
  # remove all rows where D < 0
  dplyr::filter(D > 0) %>% 
  ggplot() +
    geom_histogram(aes(fd, fill = p2), bins = 50) +
    facet_wrap(vars(p1, p2)) +
    scale_fill_manual(values = pal_abba) +
    xlab("P2") +
#    ylab("Number of windows (sites) with positive D") +
    ggtitle(expression(italic(f[d]))) +
    theme_cowplot()  

ggsave(here::here("plots/introgression/20210429_p1_hdrr_v_icab_fd_distribution.png"),
       device = "png",
       width=15.69,
       height=10,
       units = "cm",
       dpi = 400)

Get mean fd for each population

readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  # recode lines
  dplyr::mutate(across(c("p1", "p2"), ~factor(.x, levels = c("icab", "hdrr", "hni", "hsok"))),
                across(c("p1", "p2"), ~recode(.x, icab = "iCab", hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))) %>%
  # remove all rows where D < 0
  dplyr::filter(D > 0) %>% 
  dplyr::filter(p1 == "HdrR") %>% 
  dplyr::group_by(p2) %>% 
  dplyr::summarise(mean(fd))
mikk_abba_final = readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  # recode `fd` as 0 if `D` is negative
  dplyr::mutate(fd = ifelse(D < 0, 0, fd))

# Is iCab or HdrR closer to MIKK?
mikk_abba_final %>% 
  dplyr::filter(p2 %in% c("icab", "hdrr")) %>% 
  dplyr::group_by(p1, p2) %>% 
  dplyr::summarise(length(which(fd == 0)))

# So there are fewer 0s when HdrR is P1, which suggests that iCab is more closely related to the MIKK panel. But you want a P1 that is further away so that you'll get more data points, which is why we'll likely go with HdrR. 
mikk_abba_final = readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  # recode `fd` as 0 if `D` is negative
  dplyr::mutate(fd = ifelse(D < 0, 0, fd))
mikk_abba_final = mikk_abba_final %>% 
  # recode lines
  dplyr::mutate(p2 = factor(p2, levels = c("hdrr", "icab", "hni", "hsok")),
                p2 = recode(p2, hdrr = "HdrR", icab = "iCab", hni = "HNI", hsok = "HSOK")) %>% 
  dplyr::arrange(p2, scaffold, start) %>% 
  dplyr::select(scaffold, mid_1 = mid, mid_2 = mid, fd, p1, p2) %>% 
  dplyr::mutate(scaffold = paste("chr", scaffold, sep ="")) %>% 
  split(., f = .$p1) %>% 
  purrr::map(., function(P1) split(P1, f = P1$p2)) %>% 
  # Remove empty data frames for hdrr-hdrr and icab-icab population combinations
  purrr::map(., function(P1) P1[purrr::map_lgl(P1, function(P2) nrow(P2) != 0)])

Plot

out_plot = here::here("plots", "introgression", "20210427_introgression_circos_MIKK_ABBA_p1-hdrr.png")
target_list = mikk_abba_final[["hdrr"]]

png(out_plot,
    width = 20,
    height = 20,
    units = "cm",
    res = 500)

# Set parameters
## Decrease cell padding from default c(0.02, 1.00, 0.02, 1.00)
circos.par(cell.padding = c(0, 0, 0, 0),
           track.margin = c(0, 0),
           gap.degree = c(rep(1, nrow(chroms) - 1), 6))
# Initialize plot
circos.initializeWithIdeogram(chroms,
                              plotType = c("axis", "labels"),
                              major.by = 1e7,
                              axis.labels.cex = 0.25*par("cex"))

# Print label in center
text(0, 0, "MIKK panel\nintrogression with\niCab, HNI,\nand\nHSOK")

###############
# Introgression
###############
counter = 0

purrr::map(target_list, function(P2){
  # Set counter
  counter <<- counter + 1
  
  circos.genomicTrack(P2,
      panel.fun = function(region, value, ...){
        circos.genomicLines(region,
                            value[[1]],
                            col = pal_abba[[names(target_list[counter])]],
                            area = T,
                            border = karyoploteR::darker(pal_abba[[names(target_list[counter])]],
                                                         amount = 80))
        # Add baseline
        circos.xaxis(h = "bottom",
                     labels = F,
                     major.tick = F)
      },
      track.height = 0.1,
      bg.border = NA,
      ylim = c(0, 1))
  
  # Add axis for introgression
  circos.yaxis(side = "right",
             at = c(.5, 1),
             labels.cex = 0.25*par("cex"),
             tick.length = 2
             )
  
  # Add y-axis label for introgression
  if (counter == 2) {
  circos.text(0, 0.5,
              labels = expression(italic(f[d])),
              sector.index = "chr1",
#              facing = "clockwise",
              adj = c(3, 0.5),
              cex = 0.4*par("cex"))
  }
  
  # Add y-axis label for introgression
  circos.text(0, 0.5,
              labels = names(target_list)[counter],
              sector.index = "chr1",
              facing = "clockwise",
              adj = c(.5, 0),
              cex = 0.6*par("cex"))  
})

circos.clear()

dev.off()
knitr::include_graphics(basename(out_plot))

out_plot = here::here("plots", "introgression", "20210427_introgression_circos_MIKK_ABBA_p1-icab.png")
target_list = mikk_abba_final[["icab"]]

png(out_plot,
    width = 20,
    height = 20,
    units = "cm",
    res = 500)

# Set parameters
## Decrease cell padding from default c(0.02, 1.00, 0.02, 1.00)
circos.par(cell.padding = c(0, 0, 0, 0),
           track.margin = c(0, 0),
           gap.degree = c(rep(1, nrow(chroms) - 1), 6))
# Initialize plot
circos.initializeWithIdeogram(chroms,
                              plotType = c("axis", "labels"),
                              major.by = 1e7,
                              axis.labels.cex = 0.25*par("cex"))

# Print label in center
text(0, 0, "MIKK panel\nintrogression with\nHdrR, HNI,\nand\nHSOK")

###############
# Introgression
###############
counter = 0

purrr::map(target_list, function(P2){
  # Set counter
  counter <<- counter + 1
  
  circos.genomicTrack(P2,
      panel.fun = function(region, value, ...){
        circos.genomicLines(region,
                            value[[1]],
                            col = pal_abba[[names(target_list[counter])]],
                            area = T,
                            border = karyoploteR::darker(pal_abba[[names(target_list[counter])]]))
        # Add baseline
        circos.xaxis(h = "bottom",
                     labels = F,
                     major.tick = F)
      },
      track.height = 0.1,
      bg.border = NA,
      ylim = c(0, 1))
  
  # Add axis for introgression
  circos.yaxis(side = "right",
             at = c(.5, 1),
             labels.cex = 0.25*par("cex"),
             tick.length = 2
             )
  
  # Add y-axis label for introgression
  if (counter == 2) {
  circos.text(0, 0.5,
              labels = expression(italic(f[d])),
              sector.index = "chr1",
#              facing = "clockwise",
              adj = c(3, 0.5),
              cex = 0.4*par("cex"))
  }
  
  # Add y-axis label for introgression
  circos.text(0, 0.5,
              labels = names(target_list)[counter],
              sector.index = "chr1",
              facing = "clockwise",
              adj = c(.5, 0),
              cex = 0.6*par("cex"))  
})

circos.clear()

dev.off()
knitr::include_graphics(basename(out_plot))

Re-do final figure

Chr2

Read in new data

in_file = here::here("data/introgression/abba_sliding_final_no_131-1", "500000_250.txt")
# Read in data
df = readr::read_csv(in_file) %>% 
  dplyr::arrange(p1, p2, scaffold, start)

# Convert fd to 0 if D < 0
df$fd = ifelse(df$D < 0,
               0,
               df$fd)

# Change names
df = df %>% 
  dplyr::mutate(p2 = recode(df$p2, hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))

# Get df with mean of melastigma/javanicus
df_kp = df %>% 
  pivot_wider(id_cols = c(scaffold, start, end, mid, p2), names_from = p1, values_from = fd) %>%
  # get mean of melastigma/javanicus
  dplyr::mutate(mean_fd = rowMeans(dplyr::select(., melastigma, javanicus), na.rm = T)) %>% 
  dplyr::arrange(p2, scaffold, start)

knitr::kable(head(df_kp))

Plot

chr4
out_file = file.path(plots_dir, "20210409_fd_with_density_chr_4_500kb-window.png")
png(file=out_file,
    width=5500,
    height=1186,
    units = "px",
    res = 400)

# Plot 
kp = plotKaryotype(med_genome, chromosomes = "4", cex = 1.5)
# Add base numbers 
karyoploteR::kpAddBaseNumbers(kp, tick.dist = 5000000, cex = 0.7)
# Add data backgrounds
karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add axis label
kpAxis(kp, r0=0.3, r1 = 1, cex = 0.8)
# Add fd data
lwd = 2
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HNI"],
                     x = df_kp$mid[df_kp$p2 == "HNI"],
                     y = df_kp$mean_fd[df_kp$p2 == "HNI"],
                     col = "#F6673A",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HdrR"],
                     x = df_kp$mid[df_kp$p2 == "HdrR"],
                     y = df_kp$mean_fd[df_kp$p2 == "HdrR"],
                     col = "#F3B61F",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HSOK"],
                     x = df_kp$mid[df_kp$p2 == "HSOK"],
                     y = df_kp$mean_fd[df_kp$p2 == "HSOK"],
                     col = "#631E68",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
# Add SNP density data
kpPlotDensity(kp, data=mikk_ranges, col = "#49A379",
              r0=0, r1=0.1, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hni, col = "#F6673A",
              r0=0.1, r1=0.2, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hsok, col = "#631E68", 
              r0=0.2, r1=0.3, 
              window.size = 25000)
#kpPlotDensity(kp, data=ol_ranges_list$hdrr, col = "#F3B61F",
#              r0=0.45, r1=0.6,
#              window.size = 25000)
# Add exon density to ideogram
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 1)
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 0)
# Add labels
kpAddLabels(kp, labels="MIKK",
            r0=0, r1=0.05,
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HNI",
            r0=0.1, r1=0.15, 
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HSOK",
            r0=0.2, r1=0.25,
            label.margin = 0.001,
            cex = 0.5)

kpAddLabels(kp, labels=bquote(italic(f[d])),
            r0=0.3, r1=1, 
            label.margin = 0.035,
            cex = 1)
dev.off()
knitr::include_graphics(basename(out_file))
chr2
out_file = file.path(plots_dir, "20210409_fd_with_density_chr_2_500kb-window.png")
png(file=out_file,
    width=5500,
    height=1186,
    units = "px",
    res = 400)

# Plot 
kp = plotKaryotype(med_genome, chromosomes = "2", cex = 1.5)
# Add base numbers 
karyoploteR::kpAddBaseNumbers(kp, tick.dist = 5000000, cex = 0.7)
# Add data backgrounds
karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add axis label
kpAxis(kp, r0=0.3, r1 = 1, cex = 0.8)
# Add fd data
lwd = 2
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HNI"],
                     x = df_kp$mid[df_kp$p2 == "HNI"],
                     y = df_kp$mean_fd[df_kp$p2 == "HNI"],
                     col = "#F6673A",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HdrR"],
                     x = df_kp$mid[df_kp$p2 == "HdrR"],
                     y = df_kp$mean_fd[df_kp$p2 == "HdrR"],
                     col = "#F3B61F",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HSOK"],
                     x = df_kp$mid[df_kp$p2 == "HSOK"],
                     y = df_kp$mean_fd[df_kp$p2 == "HSOK"],
                     col = "#631E68",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
# Add SNP density data
kpPlotDensity(kp, data=mikk_ranges, col = "#49A379",
              r0=0, r1=0.1, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hni, col = "#F6673A",
              r0=0.1, r1=0.2, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hsok, col = "#631E68", 
              r0=0.2, r1=0.3, 
              window.size = 25000)
#kpPlotDensity(kp, data=ol_ranges_list$hdrr, col = "#F3B61F",
#              r0=0.45, r1=0.6,
#              window.size = 25000)
# Add exon density to ideogram
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 1)
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 0)
# Add labels
kpAddLabels(kp, labels="MIKK",
            r0=0, r1=0.05,
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HNI",
            r0=0.1, r1=0.15, 
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HSOK",
            r0=0.2, r1=0.25,
            label.margin = 0.001,
            cex = 0.5)

kpAddLabels(kp, labels=bquote(italic(f[d])),
            r0=0.3, r1=1, 
            label.margin = 0.035,
            cex = 1)
dev.off()
knitr::include_graphics(basename(out_file))

Use chr4

Compose final figure

abba_diagram = here::here(plots_dir, "abba_diagram.png")
tree = here::here(plots_dir, "tree_oryzias_with_ancestor.png")
karyo_chr4 = here::here(plots_dir, "20210409_fd_with_density_chr_4_500kb-window.png")
circos_abba = here::here(plots_dir, "20210409_introgression_circos_MIKK_ABBA.png")

final_abba = ggdraw() +
  draw_image(tree,
            x = 0, y = .7, width = .4, height = .35, vjust = .1, hjust = -.1, scale = 1.2) +
  draw_image(karyo_chr4,
             x = 0, y = 0, width = 1, height = 0.3, scale = 1.2) +  
  draw_image(circos_abba,
             x = .4, y = .3, width = .6, height = .7, scale = 1.15, vjust = .025) +
  draw_image(abba_diagram,
          x = 0, y = .3, width = .4, height = .35, vjust = -.05, scale = 1.1) +
  draw_plot_label(label = c("A", "B", "C", "D"), size = 25,
                  x = c(0, 0, .45, 0), y = c(1, .7, 1, .3), color = "#4f0943")  


final_abba
ggsave(here::here(plots_dir, "20210409_introgression_final_figure.png"),
       width = 35,
       height = 21.875,
       units = "cm",
       dpi = 500)
---
title: "Introgression"
date: '`r format(Sys.Date())`'
#output:
#  html_document:
#    toc: true
#    toc_float: true
#    dev: 'svg'
#    number_sections: true
#    pandoc_args: --lua-filter=color-text.lua
#    highlight: pygments  
output: html_notebook
editor_options: 
  chunk_output_type: inline
---

We want to determine the extent to which the MIKK panel shows evidence of introgression with other medaka populations, specifically the allopatric Northern and Korean medaka strains, relative to sympatric Southern medaka strains.

# Setup

```{r, message = F}
library(here)
source(here::here("code", "scripts", "introgression", "source.R"))
```

## Working directory

Working directory on EBI cluster: `/hps/research1/birney/users/ian/mikk_paper`

## Create Singularity containers

```{bash, eval = F}
container_dir=../sing_conts
# load Singularity (version 3.5.0)
module load singularity 

for package in $( echo r-base_4.0.4 bcftools_1.9 numpy_1.15.4 bash_3.0.22 bioconductor_3.12 ) ; do
  if [[ ! -f $container_dir/$package.sif ]]; then
    singularity build \
      --remote \
      $container_dir/$package.sif \
      envs/$package.def 
  fi ;
done
```

## `renv`

Install all required packages (using r-base in Singularity container).

```{bash, eval = F}
container_dir=../sing_conts
baseR=r-base_4.0.4

bsub -Is "singularity shell $container_dir/$baseR"
```

```{r, eval = F}
# Install all required packages
renv::restore()
```

## Copy scripts from Simon Martin's GitHub repo 

```{bash, eval = F}
wget -P code/scripts/introgression https://raw.githubusercontent.com/simonhmartin/genomics_general/master/ABBABABAwindows.py

wget -P code/scripts/introgression https://raw.githubusercontent.com/simonhmartin/genomics_general/master/genomics.py
```

# Process multiple alignment data 

## Download from Ensembl

* [Ensembl 103]{color="purple"}: Only includes Oryzias latipes HdrR and Oryzias melastigma (Indian medaka).
* [Ensembl 102]{color="purple}: Includes all target species. Use. 

`ftp://ftp.ensembl.org/pub/release-102/emf/ensembl-compara/multiple_alignments/50_fish.epo/README.50_fish.epo` reads:

>Alignments are grouped by japanese medaka hdrr chromosome, and then by coordinate system.
Alignments containing duplications in japanese medaka hdrr are dumped once per duplicated segment.
The files named *.other*.emf contain alignments that do not include any japanese medaka hdrr
region. Each file contains up to 200 alignments.


```{bash, eval = F}
ftp_dir=ftp://ftp.ensembl.org/pub/release-102/emf/ensembl-compara/multiple_alignments/50_fish.epo/
target_dir=../introgression/release-102

mkdir -p $target_dir/raw

# download, exlcuding *.other* files
wget -P $target_dir/raw $ftp_dir/* -R "*other*"

# unzip into new directory (excluding "other")
$target_dir/unzipped
mkdir -p $target_dir/unzipped

for i in $(find $target_dir/raw/50_fish.epo.[0-9]*); do
  name=$(basename $i | cut -f3,4 -d'.');
  bsub "zcat $i > $target_dir/unzipped/$name";
done
```


* [**NOTE**]{color="red"}: File 6_2.emf is in a completely different format, with CIGAR strings instead of the normal SEQ, TREE, ID and DATA segments. It appears the file is corrupted. 
* The 6_2 file in `release 101` is unaffected. Remove all `release 102` chr 6 files and replace with `release 101` files.

```{bash, eval = F}
# remove release 102 files for chr 6
rm $target_dir/unzipped/6_*

# download chr 6 files from release 101
ftp_dir_101=ftp://ftp.ensembl.org/pub/release-101/emf/ensembl-compara/multiple_alignments/50_fish.epo/
target_dir_101=../introgression/release-101

mkdir -p $target_dir_101/raw

wget -P $target_dir_101/raw $ftp_dir_101/50_fish.epo.6_*

# unzip
mkdir -p $target_dir_101/unzipped

for i in $(find $target_dir_101/raw/*); do
  name=$(basename $i | cut -f3,4 -d'.') ;
  zcat $i > $target_dir_101/unzipped/$name ;
done

# copy over to release 102 directory
cp $target_dir_101/unzipped/* $target_dir/unzipped

```

## Generate tree plot

Copy tree to file

```{bash, eval = F}
tree_file=data/introgression/release_102_tree.txt

awk "NR==58,NR==205" $target_dir/raw/README.50_fish.epo \
  > $tree_file
```

Then manually edit `$tree_file` using regex to find spaces and replace them with "_":
`{bash} (?<=[a-z])( )(?=[a-z])`

```{r}
phylo_tree <- ape::read.tree(file = here::here("data", "introgression", "release_102_tree.txt"))
```

### Full tree 

```{r, fig.width=11, fig.height=12.5}
# Colour all Oryzias
ids <- phylo_tree$tip.label[grep("Oryzias", phylo_tree$tip.label)]
# get indices of edges descending from MRCA (determined through trial and error)
oryzias_nodes <- seq(35, 42)
all_med_col <- ifelse(1:length(phylo_tree[["edge.length"]]) %in% oryzias_nodes, "#E84141", "black")
# set colours for tip labels
all_med_tip <- ifelse(phylo_tree$tip.label %in% ids, "#E84141", "black")
# plot
ape::plot.phylo(phylo_tree,
                use.edge.length = T,
                edge.color = all_med_col,
                tip.color = all_med_tip,
                font = 4)
```

```{r, eval = F}
# Save to repo
png(file= file.path(plots_dir, "tree_all.png"),
    width=22,
    height=25,
    units = "cm",
    res = 400)
ape::plot.phylo(phylo_tree,
                use.edge.length = T,
                edge.color = all_med_col,
                tip.color = all_med_tip,
                font = 4)
dev.off()
```

### Oryzias only

New tree file created manually to extract *Oryzias* fishes only, and replace reference codes (e.g. "ASM223467v1") with line names (e.g. "HdrR").

```{r}
in_file = here::here("data/introgression/release_102_tree_oryzias_only.txt")
# Read in
phylo_tree <- ape::read.tree(file = in_file)
# Set colours 
phylo_cols <- c("#55b6b0", "#f33a56", "#f3b61f", "#f6673a", "#631e68")
# Plot
ape::plot.phylo(phylo_tree,
                font = 4,
                tip.color = phylo_cols)

```

```{r, eval = F}
out_file = here::here(plots_dir, "tree_oryzias.png")
# Save
png(file=out_file,
    width=2700,
    height=1720,
    units = "px",
    res = 400)
ape::plot.phylo(phylo_tree,
                font = 4,
                tip.color = phylo_cols)
dev.off()
```

#### Add cross for ancestor

```{r}
out_file = here::here(plots_dir, "tree_oryzias_with_ancestor.png")
```

```{r, eval = F}
in_file = here::here(plots_dir, "tree_oryzias.png")

tree_path = in_file

ggdraw() +
  draw_image(tree_path) +
  draw_label("x",
             x = 0.152,
             y = 0.52,
             fontface = "bold",
             color = "#f77cb5",
             size = 25)

ggsave(out_file,
    width=15.69,
    height=10,
    units = "cm",
    dpi = 400)
```


```{r, include = F}
# copy to same directory as current notebook
current_dir = dirname(rstudioapi::getSourceEditorContext()$path)

new_path = file.path(current_dir, basename(out_file))

if (file.exists(new_path) != T){
  file.copy(out_file, new_path)
}
```

```{r}
knitr::include_graphics(basename(out_file))
```

## Divide by segment

```{bash, eval = F}
target_dir=../introgression/release-102
segments_dir=$target_dir/segmented
date=20210312
script=code/scripts/introgression/20200907_extract-emf-segments.sh

mkdir -p $segments_dir

for i in $(find $target_dir/unzipped/* ); do
  # get basename
  bname=$(basename $i);
  bname_short=$(echo ${bname::-4} );
  # get chromosome
  chr=$(echo $bname | cut -f1 -d"_" );
  # make directory for each EMF file
  new_path=$(echo $segments_dir/$bname_short );
  if [ ! -d "$new_path" ]; then
    mkdir $new_path;
  fi
  # get segment count
  segment_count=$(grep "^DATA" $i | wc -l );
  # get segment start and end for each file
  for j in $(seq 1 $segment_count ); do
    bsub \
      -o ../log/$date\segment_$bname_short\_$j.out \
      -e ../log/$date\segment_$bname_short\_$j.err \
      "$script $i $j $new_path "
  done;  
done

# How many files?
find $segments_dir/*/*.data.txt | wc -l
# 8951
find $in_dir/*/*_1.data.txt | wc -l
# 4341
find $in_dir/*/*_-1.data.txt | wc -l
# 4610
```

## Run analysis pipeline with `snakemake`

```{bash, eval = F}
snmk_proj="introgression"

module load singularity
conda activate snakemake

snakemake \
 --jobs 5000 \
 --latency-wait 100 \
 --cluster-config code/snakemake/$snmk_proj/config/cluster.json \
 --cluster 'bsub -g /snakemake_bgenie -J {cluster.name} -n {cluster.n} -M {cluster.memory} -o {cluster.output} -e {cluster.error}' \
 --keep-going \
 --rerun-incomplete \
 --use-conda \
 --use-singularity \
 -s code/snakemake/$snmk_proj/Snakefile \
 -p
```

# *f* statistic analysis

## Read in data

```{r, results = 'asis'}
data_file = here::here("data", "introgression", "20210315_f_stat_final.txt")
# Read in data
final_df <- read.table(data_file,
                       header = T,
                       sep = "\t",
                       as.is = T)

final_df <- final_df %>% 
  dplyr::mutate(across(P2,
                       ~factor(.x, levels = fish_order))) %>% 
  dplyr::mutate(chr = factor(chr, levels = chr_order))

knitr::kable(head(final_df))
```


## Create DF with mean `melastigma` and `javanicus`

```{r, results = 'asis'}
cor_df <- final_df %>% 
  # filter for when P1 is another Oryzias, and P2 
  dplyr::filter(P1 %in% c("javanicus", "melastigma") & P2 != "MIKK" & P3 == "MIKK") %>% 
  # pivot to put the admixture_f stat for melastigma and javanicus in the same row
  tidyr::pivot_wider(id_cols = c("P2", "chr"),
                     names_from = P1,
                     values_from = c(admix_f, f_ci_lower, f_ci_upper))

cor_df$chr <- as.character(cor_df$chr)
cor_df$chr <- ifelse(cor_df$chr == "all", "genome-wide", cor_df$chr)
chr_order_plot <- c(seq(1,24), "genome-wide")
cor_df$chr <- factor(cor_df$chr, levels = chr_order_plot)

cor_df_means <- cor_df %>%
  # apply across rows
  dplyr::rowwise() %>% 
  # get means for f and CIs
  dplyr::mutate(mean_f = mean(c(admix_f_javanicus, admix_f_melastigma)),
                mean_ci_upper = mean(c(f_ci_upper_javanicus, f_ci_upper_melastigma)),
                mean_ci_lower = mean(c(f_ci_lower_javanicus, f_ci_lower_melastigma))) %>% 
  # set stats at a maximum of 1
  dplyr::mutate(across(c("mean_f", "mean_ci_upper", "mean_ci_lower"),
                       ~dplyr::if_else(.x > 1,
                                       1,
                                       .x)))

knitr::kable(head(cor_df_means))
```

## Plot

```{r}
fstat_plot = cor_df_means %>% 
  ggplot(aes(P2, mean_f, fill = P2)) +
    geom_col() +
    geom_errorbar(aes(ymin = mean_ci_lower,
                      ymax = mean_ci_upper),
                  position = position_dodge(0.9),
                  width = 0.25) +  
    guides(fill = F) +
    facet_wrap(~chr) +
    ylim(0,1) +
    ylab(expression(paste("Mean ", italic("f"), " statistic"))) +
    theme_cowplot(font_size = 8) +
    scale_fill_manual(values = pal_abba)

fstat_plot
```

```{r, eval = F}
out_file = here::here(plots_dir, "20210315_f_stat")

# PNG
ggsave(filename = paste(out_file, ".png", sep = ""),
       device = "png",
       width = 24.75,
       height = 19.5,
       units = "cm",
       dpi = 500)

# SVG
ggsave(filename = paste(out_file, ".svg", sep = ""),
       device = "svg",
       width = 24.75,
       height = 19.5,
       units = "cm")
```

# Sliding windows ABBA BABA

## Read in data

```{r, message = F}
in_file = here::here("data", "introgression", "abba_sliding_final", "50000_100.txt")
# Read in data
df = readr::read_csv(in_file) %>% 
  dplyr::arrange(p1, p2, scaffold, start)

# Convert fd to 0 if D < 0
df$fd = ifelse(df$D < 0,
               0,
               df$fd)

# Change names
df = df %>% 
  dplyr::mutate(p2 = recode(df$p2, hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))


```

## Plot

### Standard

```{r, fig.width = 10, fig.height = 20}
df %>% 
  dplyr::filter(p1 == "melastigma") %>% 
  ggplot() +
    geom_line(aes(mid, fd, colour = p2)) +
    facet_wrap(~scaffold, nrow = 24, ncol = 1) +
    scale_colour_manual(values = pal_abba) +
    theme_bw(base_size = 10) +
    scale_x_continuous(breaks = c(0, 5000000, 10000000, 15000000, 20000000, 25000000, 30000000, 35000000),
                       labels = scales::comma) +
    xlab("Base position") +
    ylab(bquote(italic(f[d]))) +
    labs(colour = "P2")
```

```{r, eval = F}
out_file = here::here(plots_dir, "20210317_abba_sliding.png")

ggsave(filename = out_file,
       device = "png",
       width = 24.75,
       height = 50,
       units = "cm",
       dpi = 300)
```

### Karyoplot

#### Make custom chromosome scaffold

```{r}
# Get chromosome lengths
med_chr_lens = read.table(here("data", "Oryzias_latipes.ASM223467v1.dna.toplevel.fa_chr_counts.txt"),
                          col.names = c("chr", "end"))
# Add start
med_chr_lens$start = 1
# Reorder
med_chr_lens = med_chr_lens %>% 
  dplyr::select(chr, start, end)
# Create custom genome
med_genome = regioneR::toGRanges(med_chr_lens)
```

####  Process ABBA sliding windows data

```{r, message = F, results = 'asis'}
in_file = here::here("data", "introgression", "abba_sliding_final", "50000_100.txt")
# Read in data
df = readr::read_csv(in_file) %>% 
  dplyr::arrange(p1, p2, scaffold, start)

# Convert fd to 0 if D < 0
df$fd = ifelse(df$D < 0,
               0,
               df$fd)

# Change names
df = df %>% 
  dplyr::mutate(p2 = recode(df$p2, hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))

# Get df with mean of melastigma/javanicus
df_kp = df %>% 
  pivot_wider(id_cols = c(scaffold, start, end, mid, p2), names_from = p1, values_from = fd) %>%
  # get mean of melastigma/javanicus
  dplyr::mutate(mean_fd = rowMeans(dplyr::select(., melastigma, javanicus), na.rm = T)) %>% 
  dplyr::arrange(p2, scaffold, start)

knitr::kable(head(df_kp))
```

#### Read in SNP density data

##### HNI and HSOK

```{r, eval = F}
in_file = here::here("data/introgression/hni_hsok.txt.gz")
# Read in file on local
ol_ranges_df = read.table(in_file,
                          header = T, 
                          sep = "\t", 
                          as.is = T)

ol_ranges_df_long = ol_ranges_df %>% 
  tidyr::pivot_longer(cols = c(hni, hsok), names_to = "line", values_to = "present")

ol_ranges_list = split(ol_ranges_df_long, f = ol_ranges_df_long$line)

ol_ranges_list = lapply(ol_ranges_list, function(x){
  # remove NAs
  df = x %>% 
    tidyr::drop_na(present)
  # convert to GRanges object
  ol_ranges = GenomicRanges::makeGRangesFromDataFrame(df,
                                                      ignore.strand = T,
                                                      seqnames.field = "chr",
                                                      start.field = "pos",
                                                      end.field = "pos")
  return(ol_ranges)
})

```

```{r, eval = F, include = F}
# Save to speed up when rendering HTML
saveRDS(ol_ranges_list, here::here("data/introgression/hni_hsok_range.rds"))
```

```{r, include = F}
ol_ranges_list = readRDS(here::here("data/introgression/hni_hsok_range.rds"))
```

##### MIKK

```{r, eval = F}
in_file = here::here("data/introgression/mikk.txt.gz")
# Read in file on local
mikk_ranges_df = read.table(in_file,
                            col.names = c("chr", "pos"), 
                            sep = "\t", 
                            as.is = T)

# Convert to GRanges object

mikk_ranges = GenomicRanges::makeGRangesFromDataFrame(mikk_ranges_df,
                                                      ignore.strand = T,
                                                      seqnames.field = "chr",
                                                      start.field = "pos",
                                                      end.field = "pos")
```

```{r, eval = F, include = F}
# Save to speed up when rendering HTML
saveRDS(mikk_ranges, here::here("data/introgression/mikk_range.rds"))
```

```{r, include = F}
mikk_ranges = readRDS(here::here("data/introgression/mikk_range.rds"))
```

#### Get exon density

```{r, eval = F}
# Get list of exons from biomaRt

## Select dataset
olat_mart = biomaRt::useEnsembl(biomart = "ensembl", dataset = "olatipes_gene_ensembl")
## Get attributes of interest (exon ID, chr, start, end)
exons <- biomaRt::getBM(attributes = c("chromosome_name", "ensembl_gene_id", "ensembl_transcript_id", "transcript_start", "transcript_end", "transcript_length", "ensembl_exon_id", "rank", "strand", "exon_chrom_start", "exon_chrom_end", "cds_start", "cds_end"),
               mart = olat_mart)
## Convert exons to GRanges
ex_ranges = GenomicRanges::makeGRangesFromDataFrame(exons,
                                                    ignore.strand = T,
                                                    seqnames.field = "chromosome_name",
                                                    start.field = "exon_chrom_start",
                                                    end.field = "exon_chrom_end")
```

```{r, eval = F, include = F}
# Save to speed up when rendering HTML
saveRDS(ex_ranges, here::here("data/introgression/ex_range.rds"))
```

```{r, include = F}
ex_ranges = readRDS(here::here("data/introgression/ex_range.rds"))
```

#### All chromosomes

```{r}
file_out = file.path(plots_dir, "20210318_fd_with_density_all.png")
```


```{r, eval = F}
# Save
png(file=file_out,
    width=8500,
    height=13500,
    units = "px",
    res = 400)
# Plot 
kp = plotKaryotype(med_genome)
# Add base numbers 
karyoploteR::kpAddBaseNumbers(kp, tick.dist = 5000000, cex = 0.3)
# Add data backgrounds
karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add axis label
kpAxis(kp, r0=0.3, r1 = 1, cex = 0.4)
# Add fd data
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HNI"],
                     x = df_kp$mid[df_kp$p2 == "HNI"],
                     y = df_kp$mean_fd[df_kp$p2 == "HNI"],
                     col = "#F6673A",
                     r0=0.3, r1 = 1)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HdrR"],
                     x = df_kp$mid[df_kp$p2 == "HdrR"],
                     y = df_kp$mean_fd[df_kp$p2 == "HdrR"],
                     col = "#F3B61F",
                     r0=0.3, r1 = 1)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HSOK"],
                     x = df_kp$mid[df_kp$p2 == "HSOK"],
                     y = df_kp$mean_fd[df_kp$p2 == "HSOK"],
                     col = "#631E68",
                     r0=0.3, r1 = 1)
# Add SNP density data
kpPlotDensity(kp, data=mikk_ranges, col = "#49A379",
              r0=0, r1=0.1, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hni, col = "#F6673A",
              r0=0.1, r1=0.2, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hsok, col = "#631E68", 
              r0=0.2, r1=0.3, 
              window.size = 25000)
#kpPlotDensity(kp, data=ol_ranges_list$hdrr, col = "#F3B61F",
#              r0=0.45, r1=0.6,
#              window.size = 25000)
# Add exon density to ideogram
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 1)
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 0)
# Add labels
kpAddLabels(kp, labels="MIKK",
            r0=0, r1=0.05,
            label.margin = 0.001,
            cex = 0.4)
kpAddLabels(kp, labels="HNI",
            r0=0.1, r1=0.15, 
            label.margin = 0.001,
            cex = 0.4)
kpAddLabels(kp, labels="HSOK",
            r0=0.2, r1=0.25,
            label.margin = 0.001,
            cex = 0.4)
#kpAddLabels(kp, labels="HdrR",
#            r0=0.45, r1=0.6, 
#            cex = 0.4)
kpAddLabels(kp, labels=bquote(italic(f[d])),
            r0=0.3, r1=1, 
            label.margin = 0.035,
            cex = 0.6)
dev.off()
```

```{r, include = F}
# copy to same directory as current notebook
current_dir = dirname(rstudioapi::getSourceEditorContext()$path)

new_path = file.path(current_dir, basename(file_out))

if (file.exists(new_path) != T){
  file.copy(file_out, new_path)
}
```

```{r}
knitr::include_graphics(basename(file_out))
```

#### Chromosome 4

```{r}
out_file = file.path(plots_dir, "20210318_fd_with_density_chr_4.png")
```


```{r, eval = F}
png(file=out_file,
    width=5500,
    height=1186,
    units = "px",
    res = 400)

# Plot 
kp = plotKaryotype(med_genome, chromosomes = "4", cex = 1.5)
# Add base numbers 
karyoploteR::kpAddBaseNumbers(kp, tick.dist = 5000000, cex = 0.7)
# Add data backgrounds
karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add axis label
kpAxis(kp, r0=0.3, r1 = 1, cex = 0.8)
# Add fd data
lwd = 2
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HNI"],
                     x = df_kp$mid[df_kp$p2 == "HNI"],
                     y = df_kp$mean_fd[df_kp$p2 == "HNI"],
                     col = "#F6673A",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HdrR"],
                     x = df_kp$mid[df_kp$p2 == "HdrR"],
                     y = df_kp$mean_fd[df_kp$p2 == "HdrR"],
                     col = "#F3B61F",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HSOK"],
                     x = df_kp$mid[df_kp$p2 == "HSOK"],
                     y = df_kp$mean_fd[df_kp$p2 == "HSOK"],
                     col = "#631E68",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
# Add SNP density data
kpPlotDensity(kp, data=mikk_ranges, col = "#49A379",
              r0=0, r1=0.1, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hni, col = "#F6673A",
              r0=0.1, r1=0.2, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hsok, col = "#631E68", 
              r0=0.2, r1=0.3, 
              window.size = 25000)
#kpPlotDensity(kp, data=ol_ranges_list$hdrr, col = "#F3B61F",
#              r0=0.45, r1=0.6,
#              window.size = 25000)
# Add exon density to ideogram
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 1)
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 0)
# Add labels
kpAddLabels(kp, labels="MIKK",
            r0=0, r1=0.05,
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HNI",
            r0=0.1, r1=0.15, 
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HSOK",
            r0=0.2, r1=0.25,
            label.margin = 0.001,
            cex = 0.5)
#kpAddLabels(kp, labels="HdrR",
#            r0=0.45, r1=0.6, 
#            cex = 0.4)
kpAddLabels(kp, labels=bquote(italic(f[d])),
            r0=0.3, r1=1, 
            label.margin = 0.035,
            cex = 1)
dev.off()
```

```{r, include = F}
# copy to same directory as current notebook
current_dir = dirname(rstudioapi::getSourceEditorContext()$path)

new_path = file.path(current_dir, basename(out_file))

if (file.exists(new_path) != T){
  file.copy(out_file, new_path)
}
```

```{r}
knitr::include_graphics(basename(out_file))
```

# Final figure

## ABBA BABA diagram

Created with [Vectr](https://vectr.com/) and saved here: `plots/introgression/20210318_abba_diagram.svg`

## Compile all

```{r, fig.width=14, fig.height=8.8}
abba_diagram = here::here(plots_dir, "abba_diagram.png")
tree = here::here(plots_dir, "tree_oryzias_with_ancestor.png")
karyo_chr4 = here::here(plots_dir, "20210318_fd_with_density_chr_4.png")

final_abba = ggdraw() +
  draw_image(tree,
            x = 0, y = .7, width = .4, height = .35, vjust = .1, hjust = -.1, scale = 1.2) +
  draw_image(karyo_chr4,
             x = 0, y = 0, width = 1, height = 0.3, scale = 1.2) +   
  draw_plot(fstat_plot,
             x = .4, y = .3, width = .6, height = .7) +
  draw_image(abba_diagram,
          x = 0, y = .3, width = .4, height = .35, vjust = -.05, scale = 1.1) +
  draw_plot_label(label = c("A", "B", "C", "D"), size = 16,
                  x = c(0, 0, .38, 0), y = c(1, .7, 1, .3))  


final_abba


```
```{r, eval = F}
ggsave(here::here(plots_dir, "20210318_final_figure.png"),
       width = 35,
       height = 21.875,
       units = "cm",
       dpi = 500)
```

# New final figure with circos

## Circos

### Read in data

```{r, message = F}
target_file = here::here("data/introgression/abba_sliding_final_with_icab/min-sites-250.txt")
```

Sanity check with counts of sites:

```{r}
readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  dplyr::count(p1, p2)
```


As suggested by Simon Martin here: <https://github.com/simonhmartin/genomics_general#abba-baba-statistics-in-sliding-windows>
> fd gives meaningless values (<0 or >1) if D is negative. If there is no excess of shared derived alleles between P2 and P3 (indicated by a positive D), then the excess cannot be quantified. fd values for windows with negative D should therefore either be discarded or converted to zero, depending on your hypothesis.

#### How many windows have D > 0? 

```{r, message = F}
readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  # recode lines
  dplyr::mutate(across(c("p1", "p2"), ~factor(.x, levels = c("icab", "hdrr", "hni", "hsok"))),
                across(c("p1", "p2"), ~recode(.x, icab = "iCab", hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))) %>% 
  dplyr::group_by(p1, p2) %>% 
  dplyr::summarise(n_pos_d = length(which(D > 0))) %>% 
  ggplot() +
    geom_col(aes(p2, n_pos_d, fill = p2)) +
    facet_wrap(~p1) +
    scale_fill_manual(values = pal_abba) +
    xlab("P2") +
    ylab("Number of windows (sites) with positive D") +
    ggtitle("Choice of P1 (iCab or HdrR)") +
    theme_cowplot() +
    labs(fill = "P2")
```

```{r, eval = F}
ggsave(here::here("plots/introgression/20210429_p1_hdrr_v_icab_counts_valid_sites.png"),
       device = "png",
       width=15.69,
       height=10,
       units = "cm",
       dpi = 400)
```

#### Distributions of $f_D$

```{r, message = F}
readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  # recode lines
  dplyr::mutate(across(c("p1", "p2"), ~factor(.x, levels = c("icab", "hdrr", "hni", "hsok"))),
                across(c("p1", "p2"), ~recode(.x, icab = "iCab", hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))) %>%
  # remove all rows where D < 0
  dplyr::filter(D > 0) %>% 
  ggplot() +
    geom_histogram(aes(fd, fill = p2), bins = 50) +
    facet_wrap(vars(p1, p2)) +
    scale_fill_manual(values = pal_abba) +
    xlab("P2") +
#    ylab("Number of windows (sites) with positive D") +
    ggtitle(expression(italic(f[d]))) +
    theme_cowplot()  

```
```{r, eval = F}
ggsave(here::here("plots/introgression/20210429_p1_hdrr_v_icab_fd_distribution.png"),
       device = "png",
       width=15.69,
       height=10,
       units = "cm",
       dpi = 400)
```

#### Get mean fd for each population

```{r, message = F}
readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  # recode lines
  dplyr::mutate(across(c("p1", "p2"), ~factor(.x, levels = c("icab", "hdrr", "hni", "hsok"))),
                across(c("p1", "p2"), ~recode(.x, icab = "iCab", hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))) %>%
  # remove all rows where D < 0
  dplyr::filter(D > 0) %>% 
  dplyr::filter(p1 == "HdrR") %>% 
  dplyr::group_by(p2) %>% 
  dplyr::summarise(mean(fd))
```


```{r, message = F}
mikk_abba_final = readr::read_csv(target_file) %>% 
  # add sliding window length
  dplyr::mutate(window_length_kb = (end - start + 1) / 1000) %>% 
  # filter for 500 kb windows
  dplyr::filter(window_length_kb == 500) %>% 
  # recode `fd` as 0 if `D` is negative
  dplyr::mutate(fd = ifelse(D < 0, 0, fd))

# Is iCab or HdrR closer to MIKK?
mikk_abba_final %>% 
  dplyr::filter(p2 %in% c("icab", "hdrr")) %>% 
  dplyr::group_by(p1, p2) %>% 
  dplyr::summarise(length(which(fd == 0)))

# So there are fewer 0s when HdrR is P1, which suggests that iCab is more closely related to the MIKK panel. But you want a P1 that is further away so that you'll get more data points, which is why we'll likely go with HdrR. 
```


```{r, message = F}
mikk_abba_final = mikk_abba_final %>% 
  # recode lines
  dplyr::mutate(p2 = factor(p2, levels = c("hdrr", "icab", "hni", "hsok")),
                p2 = recode(p2, hdrr = "HdrR", icab = "iCab", hni = "HNI", hsok = "HSOK")) %>% 
  dplyr::arrange(p2, scaffold, start) %>% 
  dplyr::select(scaffold, mid_1 = mid, mid_2 = mid, fd, p1, p2) %>% 
  dplyr::mutate(scaffold = paste("chr", scaffold, sep ="")) %>% 
  split(., f = .$p1) %>% 
  purrr::map(., function(P1) split(P1, f = P1$p2)) %>% 
  # Remove empty data frames for hdrr-hdrr and icab-icab population combinations
  purrr::map(., function(P1) P1[purrr::map_lgl(P1, function(P2) nrow(P2) != 0)])

```

### Plot

```{r}
out_plot = here::here("plots", "introgression", "20210427_introgression_circos_MIKK_ABBA_p1-hdrr.png")
```

```{r, eval = F}
target_list = mikk_abba_final[["hdrr"]]

png(out_plot,
    width = 20,
    height = 20,
    units = "cm",
    res = 500)

# Set parameters
## Decrease cell padding from default c(0.02, 1.00, 0.02, 1.00)
circos.par(cell.padding = c(0, 0, 0, 0),
           track.margin = c(0, 0),
           gap.degree = c(rep(1, nrow(chroms) - 1), 6))
# Initialize plot
circos.initializeWithIdeogram(chroms,
                              plotType = c("axis", "labels"),
                              major.by = 1e7,
                              axis.labels.cex = 0.25*par("cex"))

# Print label in center
text(0, 0, "MIKK panel\nintrogression with\niCab, HNI,\nand\nHSOK")

###############
# Introgression
###############
counter = 0

purrr::map(target_list, function(P2){
  # Set counter
  counter <<- counter + 1
  
  circos.genomicTrack(P2,
      panel.fun = function(region, value, ...){
        circos.genomicLines(region,
                            value[[1]],
                            col = pal_abba[[names(target_list[counter])]],
                            area = T,
                            border = karyoploteR::darker(pal_abba[[names(target_list[counter])]],
                                                         amount = 80))
        # Add baseline
        circos.xaxis(h = "bottom",
                     labels = F,
                     major.tick = F)
      },
      track.height = 0.1,
      bg.border = NA,
      ylim = c(0, 1))
  
  # Add axis for introgression
  circos.yaxis(side = "right",
             at = c(.5, 1),
             labels.cex = 0.25*par("cex"),
             tick.length = 2
             )
  
  # Add y-axis label for introgression
  if (counter == 2) {
  circos.text(0, 0.5,
              labels = expression(italic(f[d])),
              sector.index = "chr1",
#              facing = "clockwise",
              adj = c(3, 0.5),
              cex = 0.4*par("cex"))
  }
  
  # Add y-axis label for introgression
  circos.text(0, 0.5,
              labels = names(target_list)[counter],
              sector.index = "chr1",
              facing = "clockwise",
              adj = c(.5, 0),
              cex = 0.6*par("cex"))  
})

circos.clear()

dev.off()
```

```{r, include = F}
# copy to same directory as current notebook
current_dir = dirname(rstudioapi::getSourceEditorContext()$path)

new_path = file.path(current_dir, basename(out_plot))

file.copy(out_plot, new_path, overwrite = T)
```

```{r}
knitr::include_graphics(basename(out_plot))
```
```{r}
out_plot = here::here("plots", "introgression", "20210427_introgression_circos_MIKK_ABBA_p1-icab.png")
```

```{r, eval = F}
target_list = mikk_abba_final[["icab"]]

png(out_plot,
    width = 20,
    height = 20,
    units = "cm",
    res = 500)

# Set parameters
## Decrease cell padding from default c(0.02, 1.00, 0.02, 1.00)
circos.par(cell.padding = c(0, 0, 0, 0),
           track.margin = c(0, 0),
           gap.degree = c(rep(1, nrow(chroms) - 1), 6))
# Initialize plot
circos.initializeWithIdeogram(chroms,
                              plotType = c("axis", "labels"),
                              major.by = 1e7,
                              axis.labels.cex = 0.25*par("cex"))

# Print label in center
text(0, 0, "MIKK panel\nintrogression with\nHdrR, HNI,\nand\nHSOK")

###############
# Introgression
###############
counter = 0

purrr::map(target_list, function(P2){
  # Set counter
  counter <<- counter + 1
  
  circos.genomicTrack(P2,
      panel.fun = function(region, value, ...){
        circos.genomicLines(region,
                            value[[1]],
                            col = pal_abba[[names(target_list[counter])]],
                            area = T,
                            border = karyoploteR::darker(pal_abba[[names(target_list[counter])]]))
        # Add baseline
        circos.xaxis(h = "bottom",
                     labels = F,
                     major.tick = F)
      },
      track.height = 0.1,
      bg.border = NA,
      ylim = c(0, 1))
  
  # Add axis for introgression
  circos.yaxis(side = "right",
             at = c(.5, 1),
             labels.cex = 0.25*par("cex"),
             tick.length = 2
             )
  
  # Add y-axis label for introgression
  if (counter == 2) {
  circos.text(0, 0.5,
              labels = expression(italic(f[d])),
              sector.index = "chr1",
#              facing = "clockwise",
              adj = c(3, 0.5),
              cex = 0.4*par("cex"))
  }
  
  # Add y-axis label for introgression
  circos.text(0, 0.5,
              labels = names(target_list)[counter],
              sector.index = "chr1",
              facing = "clockwise",
              adj = c(.5, 0),
              cex = 0.6*par("cex"))  
})

circos.clear()

dev.off()
```

```{r, include = F}
# copy to same directory as current notebook
current_dir = dirname(rstudioapi::getSourceEditorContext()$path)

new_path = file.path(current_dir, basename(out_plot))

file.copy(out_plot, new_path, overwrite = T)
```

```{r}
knitr::include_graphics(basename(out_plot))
```

## Re-do final figure

### Chr2

#### Read in new data

```{r, message = F, results = 'asis'}
in_file = here::here("data/introgression/abba_sliding_final_no_131-1", "500000_250.txt")
# Read in data
df = readr::read_csv(in_file) %>% 
  dplyr::arrange(p1, p2, scaffold, start)

# Convert fd to 0 if D < 0
df$fd = ifelse(df$D < 0,
               0,
               df$fd)

# Change names
df = df %>% 
  dplyr::mutate(p2 = recode(df$p2, hdrr = "HdrR", hni = "HNI", hsok = "HSOK"))

# Get df with mean of melastigma/javanicus
df_kp = df %>% 
  pivot_wider(id_cols = c(scaffold, start, end, mid, p2), names_from = p1, values_from = fd) %>%
  # get mean of melastigma/javanicus
  dplyr::mutate(mean_fd = rowMeans(dplyr::select(., melastigma, javanicus), na.rm = T)) %>% 
  dplyr::arrange(p2, scaffold, start)

knitr::kable(head(df_kp))
```

#### Plot

##### chr4

```{r}
out_file = file.path(plots_dir, "20210409_fd_with_density_chr_4_500kb-window.png")
```

```{r, eval = F}
png(file=out_file,
    width=5500,
    height=1186,
    units = "px",
    res = 400)

# Plot 
kp = plotKaryotype(med_genome, chromosomes = "4", cex = 1.5)
# Add base numbers 
karyoploteR::kpAddBaseNumbers(kp, tick.dist = 5000000, cex = 0.7)
# Add data backgrounds
karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add axis label
kpAxis(kp, r0=0.3, r1 = 1, cex = 0.8)
# Add fd data
lwd = 2
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HNI"],
                     x = df_kp$mid[df_kp$p2 == "HNI"],
                     y = df_kp$mean_fd[df_kp$p2 == "HNI"],
                     col = "#F6673A",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HdrR"],
                     x = df_kp$mid[df_kp$p2 == "HdrR"],
                     y = df_kp$mean_fd[df_kp$p2 == "HdrR"],
                     col = "#F3B61F",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HSOK"],
                     x = df_kp$mid[df_kp$p2 == "HSOK"],
                     y = df_kp$mean_fd[df_kp$p2 == "HSOK"],
                     col = "#631E68",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
# Add SNP density data
kpPlotDensity(kp, data=mikk_ranges, col = "#49A379",
              r0=0, r1=0.1, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hni, col = "#F6673A",
              r0=0.1, r1=0.2, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hsok, col = "#631E68", 
              r0=0.2, r1=0.3, 
              window.size = 25000)
#kpPlotDensity(kp, data=ol_ranges_list$hdrr, col = "#F3B61F",
#              r0=0.45, r1=0.6,
#              window.size = 25000)
# Add exon density to ideogram
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 1)
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 0)
# Add labels
kpAddLabels(kp, labels="MIKK",
            r0=0, r1=0.05,
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HNI",
            r0=0.1, r1=0.15, 
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HSOK",
            r0=0.2, r1=0.25,
            label.margin = 0.001,
            cex = 0.5)

kpAddLabels(kp, labels=bquote(italic(f[d])),
            r0=0.3, r1=1, 
            label.margin = 0.035,
            cex = 1)
dev.off()
```

```{r, include = F}
# copy to same directory as current notebook
current_dir = dirname(rstudioapi::getSourceEditorContext()$path)

new_path = file.path(current_dir, basename(out_file))

file.copy(out_file, new_path, overwrite = T)

```

```{r}
knitr::include_graphics(basename(out_file))
```
##### chr2

```{r}
out_file = file.path(plots_dir, "20210409_fd_with_density_chr_2_500kb-window.png")
```

```{r, eval = F}
png(file=out_file,
    width=5500,
    height=1186,
    units = "px",
    res = 400)

# Plot 
kp = plotKaryotype(med_genome, chromosomes = "2", cex = 1.5)
# Add base numbers 
karyoploteR::kpAddBaseNumbers(kp, tick.dist = 5000000, cex = 0.7)
# Add data backgrounds
karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
# Add axis label
kpAxis(kp, r0=0.3, r1 = 1, cex = 0.8)
# Add fd data
lwd = 2
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HNI"],
                     x = df_kp$mid[df_kp$p2 == "HNI"],
                     y = df_kp$mean_fd[df_kp$p2 == "HNI"],
                     col = "#F6673A",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HdrR"],
                     x = df_kp$mid[df_kp$p2 == "HdrR"],
                     y = df_kp$mean_fd[df_kp$p2 == "HdrR"],
                     col = "#F3B61F",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
karyoploteR::kpLines(kp,
                     chr = df_kp$scaffold[df_kp$p2 == "HSOK"],
                     x = df_kp$mid[df_kp$p2 == "HSOK"],
                     y = df_kp$mean_fd[df_kp$p2 == "HSOK"],
                     col = "#631E68",
                     r0=0.3, r1 = 1,
                     lwd = lwd)
# Add SNP density data
kpPlotDensity(kp, data=mikk_ranges, col = "#49A379",
              r0=0, r1=0.1, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hni, col = "#F6673A",
              r0=0.1, r1=0.2, 
              window.size = 25000)
kpPlotDensity(kp, data=ol_ranges_list$hsok, col = "#631E68", 
              r0=0.2, r1=0.3, 
              window.size = 25000)
#kpPlotDensity(kp, data=ol_ranges_list$hdrr, col = "#F3B61F",
#              r0=0.45, r1=0.6,
#              window.size = 25000)
# Add exon density to ideogram
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 1)
kpPlotDensity(kp, data=ex_ranges, col = "#f77cb5",
              data.panel = "ideogram",
              window.size = 25000,
              r0 = 0.5, r1 = 0)
# Add labels
kpAddLabels(kp, labels="MIKK",
            r0=0, r1=0.05,
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HNI",
            r0=0.1, r1=0.15, 
            label.margin = 0.001,
            cex = 0.5)
kpAddLabels(kp, labels="HSOK",
            r0=0.2, r1=0.25,
            label.margin = 0.001,
            cex = 0.5)

kpAddLabels(kp, labels=bquote(italic(f[d])),
            r0=0.3, r1=1, 
            label.margin = 0.035,
            cex = 1)
dev.off()
```

```{r, include = F}
# copy to same directory as current notebook
current_dir = dirname(rstudioapi::getSourceEditorContext()$path)

new_path = file.path(current_dir, basename(out_file))

file.copy(out_file, new_path, overwrite = T)
```

```{r}
knitr::include_graphics(basename(out_file))
```
Use chr4

### Compose final figure

```{r}
abba_diagram = here::here(plots_dir, "abba_diagram.png")
tree = here::here(plots_dir, "tree_oryzias_with_ancestor.png")
karyo_chr4 = here::here(plots_dir, "20210409_fd_with_density_chr_4_500kb-window.png")
circos_abba = here::here(plots_dir, "20210409_introgression_circos_MIKK_ABBA.png")

final_abba = ggdraw() +
  draw_image(tree,
            x = 0, y = .7, width = .4, height = .35, vjust = .1, hjust = -.1, scale = 1.2) +
  draw_image(karyo_chr4,
             x = 0, y = 0, width = 1, height = 0.3, scale = 1.2) +  
  draw_image(circos_abba,
             x = .4, y = .3, width = .6, height = .7, scale = 1.15, vjust = .025) +
  draw_image(abba_diagram,
          x = 0, y = .3, width = .4, height = .35, vjust = -.05, scale = 1.1) +
  draw_plot_label(label = c("A", "B", "C", "D"), size = 25,
                  x = c(0, 0, .45, 0), y = c(1, .7, 1, .3), color = "#4f0943")  


final_abba
```

```{r, eval = F}
ggsave(here::here(plots_dir, "20210409_introgression_final_figure.png"),
       width = 35,
       height = 21.875,
       units = "cm",
       dpi = 500)
```

